{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.model_selection import KFold\n",
    "import datetime\n",
    "import gc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas.api.types import is_datetime64_any_dtype as is_datetime\n",
    "from pandas.api.types import is_categorical_dtype\n",
    "\n",
    "def reduce_mem_usage(df, use_float16=False):\n",
    "    \"\"\"\n",
    "    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        \n",
    "    \"\"\"\n",
    "    \n",
    "    start_mem = df.memory_usage().sum() / 1024**2\n",
    "    print(\"Memory usage of dataframe is {:.2f} MB\".format(start_mem))\n",
    "    \n",
    "    for col in df.columns:\n",
    "        if is_datetime(df[col]) or is_categorical_dtype(df[col]):\n",
    "            continue\n",
    "        col_type = df[col].dtype\n",
    "        \n",
    "        if col_type != object:\n",
    "            c_min = df[col].min()\n",
    "            c_max = df[col].max()\n",
    "            if str(col_type)[:3] == \"int\":\n",
    "                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
    "                    df[col] = df[col].astype(np.int8)\n",
    "                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
    "                    df[col] = df[col].astype(np.int16)\n",
    "                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
    "                    df[col] = df[col].astype(np.int32)\n",
    "                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
    "                    df[col] = df[col].astype(np.int64)  \n",
    "            else:\n",
    "                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
    "                    df[col] = df[col].astype(np.float16)\n",
    "                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
    "                    df[col] = df[col].astype(np.float32)\n",
    "                else:\n",
    "                    df[col] = df[col].astype(np.float64)\n",
    "        else:\n",
    "            df[col] = df[col].astype(\"category\")\n",
    "\n",
    "    end_mem = df.memory_usage().sum() / 1024**2\n",
    "    print(\"Memory usage after optimization is: {:.2f} MB\".format(end_mem))\n",
    "    print(\"Decreased by {:.1f}%\".format(100 * (start_mem - end_mem) / start_mem))\n",
    "    \n",
    "    return df\n",
    "def features_engineering(df):\n",
    "    \n",
    "    # Sort by localtime\n",
    "    df.sort_values(\"local_time\")\n",
    "    df.reset_index(drop=True)\n",
    "    \n",
    "    # Add more features\n",
    "    df[\"local_time\"] = pd.to_datetime(df[\"local_time\"],format=\"%Y-%m-%d %H:%M:%S\")\n",
    "    df[\"hour\"] = df[\"local_time\"].dt.hour\n",
    "    df[\"weekend\"] = df[\"local_time\"].dt.weekday\n",
    "    df['square_feet'] =  np.log1p(df['square_feet'])\n",
    "    \n",
    "    \n",
    "    # Encode Categorical Data\n",
    "    le = LabelEncoder()\n",
    "    df[\"primary_use\"] = le.fit_transform(df[\"primary_use\"])\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Memory usage of dataframe is 2638.86 MB\n",
      "Memory usage after optimization is: 733.78 MB\n",
      "Decreased by 72.2%\n"
     ]
    }
   ],
   "source": [
    "df_train= pd.read_csv('../../Large_output/train_clean_merge.csv')\n",
    "df_train = reduce_mem_usage(df_train,use_float16=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_engineer = features_engineering(df_train)\n",
    "train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading']\\\n",
    "=train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading'].mul(0.2931)\n",
    "target = np.log1p(df_train[\"meter_reading\"])\n",
    "features = df_train[['building_id', 'meter','site_id','primary_use', 'square_feet','air_temperature',\\\n",
    "                    'cloud_coverage','dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import xgboost as xgb\n",
    "from sklearn.metrics import mean_squared_error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "del df_train, train_engineer\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/local/home/ningzesun/.local/lib/python3.6/site-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0]\ttrain-rmse:4.09607\tvalid-rmse:4.10874\n",
      "Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.\n",
      "\n",
      "Will train until valid-rmse hasn't improved in 200 rounds.\n",
      "[50]\ttrain-rmse:1.01446\tvalid-rmse:1.08258\n",
      "[100]\ttrain-rmse:0.827756\tvalid-rmse:0.916038\n",
      "[150]\ttrain-rmse:0.771476\tvalid-rmse:0.880163\n",
      "[200]\ttrain-rmse:0.736928\tvalid-rmse:0.863381\n",
      "[250]\ttrain-rmse:0.713559\tvalid-rmse:0.854703\n",
      "[300]\ttrain-rmse:0.69682\tvalid-rmse:0.85111\n",
      "[350]\ttrain-rmse:0.684234\tvalid-rmse:0.84909\n",
      "[400]\ttrain-rmse:0.673897\tvalid-rmse:0.848058\n",
      "[450]\ttrain-rmse:0.665394\tvalid-rmse:0.848331\n",
      "[500]\ttrain-rmse:0.658587\tvalid-rmse:0.848667\n",
      "[550]\ttrain-rmse:0.65175\tvalid-rmse:0.848999\n",
      "[600]\ttrain-rmse:0.64658\tvalid-rmse:0.849286\n",
      "Stopping. Best iteration:\n",
      "[409]\ttrain-rmse:0.672314\tvalid-rmse:0.847955\n",
      "\n",
      "Fold 1 | rmse: 0.8479544520378113\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/local/home/ningzesun/.local/lib/python3.6/site-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0]\ttrain-rmse:4.08945\tvalid-rmse:4.11582\n",
      "Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.\n",
      "\n",
      "Will train until valid-rmse hasn't improved in 200 rounds.\n",
      "[50]\ttrain-rmse:0.954529\tvalid-rmse:1.0678\n",
      "[100]\ttrain-rmse:0.798493\tvalid-rmse:0.923546\n",
      "[150]\ttrain-rmse:0.754606\tvalid-rmse:0.883469\n",
      "[200]\ttrain-rmse:0.729942\tvalid-rmse:0.865462\n",
      "[250]\ttrain-rmse:0.712076\tvalid-rmse:0.853953\n",
      "[300]\ttrain-rmse:0.698203\tvalid-rmse:0.847165\n",
      "[350]\ttrain-rmse:0.686687\tvalid-rmse:0.842022\n",
      "[400]\ttrain-rmse:0.676455\tvalid-rmse:0.838169\n",
      "[450]\ttrain-rmse:0.668135\tvalid-rmse:0.836115\n",
      "[500]\ttrain-rmse:0.661344\tvalid-rmse:0.835401\n",
      "[550]\ttrain-rmse:0.655651\tvalid-rmse:0.83436\n",
      "[600]\ttrain-rmse:0.650737\tvalid-rmse:0.834422\n",
      "[650]\ttrain-rmse:0.645911\tvalid-rmse:0.834574\n",
      "[700]\ttrain-rmse:0.641362\tvalid-rmse:0.834455\n",
      "[750]\ttrain-rmse:0.637191\tvalid-rmse:0.834429\n",
      "[800]\ttrain-rmse:0.633442\tvalid-rmse:0.834662\n",
      "Stopping. Best iteration:\n",
      "[625]\ttrain-rmse:0.647934\tvalid-rmse:0.834141\n",
      "\n",
      "Fold 2 | rmse: 0.8341410756111145\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/local/home/ningzesun/.local/lib/python3.6/site-packages/xgboost/core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0]\ttrain-rmse:4.10932\tvalid-rmse:4.0775\n",
      "Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.\n",
      "\n",
      "Will train until valid-rmse hasn't improved in 200 rounds.\n",
      "[50]\ttrain-rmse:0.984811\tvalid-rmse:1.06839\n",
      "[100]\ttrain-rmse:0.794776\tvalid-rmse:0.937235\n",
      "[150]\ttrain-rmse:0.736237\tvalid-rmse:0.907638\n",
      "[200]\ttrain-rmse:0.706902\tvalid-rmse:0.89719\n",
      "[250]\ttrain-rmse:0.683621\tvalid-rmse:0.889159\n",
      "[300]\ttrain-rmse:0.668755\tvalid-rmse:0.886005\n",
      "[350]\ttrain-rmse:0.654421\tvalid-rmse:0.883309\n",
      "[400]\ttrain-rmse:0.643346\tvalid-rmse:0.881163\n",
      "[450]\ttrain-rmse:0.63436\tvalid-rmse:0.880394\n",
      "[500]\ttrain-rmse:0.627085\tvalid-rmse:0.879984\n",
      "[550]\ttrain-rmse:0.621459\tvalid-rmse:0.879883\n",
      "[600]\ttrain-rmse:0.616395\tvalid-rmse:0.880216\n",
      "[650]\ttrain-rmse:0.611315\tvalid-rmse:0.880725\n",
      "[700]\ttrain-rmse:0.606582\tvalid-rmse:0.881058\n",
      "[750]\ttrain-rmse:0.602257\tvalid-rmse:0.881479\n",
      "Stopping. Best iteration:\n",
      "[553]\ttrain-rmse:0.621229\tvalid-rmse:0.879835\n",
      "\n",
      "Fold 3 | rmse: 0.879835307598114\n",
      "\n",
      "Mean rmse = 0.8539769450823467\n"
     ]
    }
   ],
   "source": [
    "columns = features.columns\n",
    "kf = KFold(n_splits=3)\n",
    "splits = kf.split(features, target)\n",
    "score = 0\n",
    "NFOLDS=3\n",
    "feature_importance_df = pd.DataFrame()\n",
    "out_folder_train_prediction= pd.DataFrame()\n",
    "models=[]\n",
    "\n",
    "# run xgb with different params\n",
    "\n",
    "for fold_n, (train_index, valid_index) in enumerate(splits):\n",
    "    dtrain = xgb.DMatrix(features.iloc[train_index],target.iloc[train_index])\n",
    "    dvalid = xgb.DMatrix(features.iloc[valid_index], target.iloc[valid_index])\n",
    "    y_valid=target.iloc[valid_index]\n",
    "# 1.14\n",
    "#     params = {'eval_metric': 'rmse',\\\n",
    "#               'objective': 'reg:squarederror',\\\n",
    "#               'booster':'gbtree',\\\n",
    "#               'nthread' : 4,\\\n",
    "#               'eta' : 0.05,\\\n",
    "#               'max_leaves': 2000,\\\n",
    "#               'max_depth' : 12,\\\n",
    "#               'subsample' : 0.9,\\\n",
    "#               'colsample_bytree' : 0.9,\\\n",
    "#               'colsample_bylevel' : 0.9,\\\n",
    "#              'gamma':1.0,\\\n",
    "#              'max_bin':500,\\\n",
    "#              'min_child_weight':3.0,\\\n",
    "#              'reg_alpha':0.1,\\\n",
    "#              'reg_lambda':0.1,\\\n",
    "#              'n_gpus': 2}\n",
    "\n",
    "    params = {'eval_metric': 'rmse',\\\n",
    "              'objective': 'reg:squarederror',\\\n",
    "              'booster':'gbtree',\\\n",
    "              'nthread' : 4,\\\n",
    "              'eta' : 0.05,\\\n",
    "              'max_leaves': 1800,\\\n",
    "              'max_depth' : 12,\\\n",
    "              'subsample' : 0.1,\\\n",
    "              'colsample_bytree' : 0.9,\\\n",
    "              'colsample_bylevel' : 0.9,\\\n",
    "              'gamma':0,\\\n",
    "              'max_bin':180,\\\n",
    "              'min_child_weight':3.0,\\\n",
    "              'reg_alpha':2.0,\\\n",
    "              'reg_lambda':0.1,\n",
    "              'tree_method': 'gpu_hist'\n",
    "              'n_gpus': 2}\n",
    " \n",
    "\n",
    "    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]\n",
    "        \n",
    "    model=xgb.train(params, dtrain, 1000, watchlist, maximize=False, early_stopping_rounds = 200, verbose_eval=50)\n",
    "    \n",
    "    y_pred_valid = model.predict(dvalid, ntree_limit=model.best_ntree_limit)\n",
    "\n",
    "    \n",
    "    oof_preds=pd.DataFrame()\n",
    "    oof_preds['train_index']=valid_index\n",
    "    oof_preds['TARGET']= y_pred_valid\n",
    "    oof_preds[\"folder\"]=fold_n + 1\n",
    "    out_folder_train_prediction = pd.concat([out_folder_train_prediction, oof_preds], axis=0)\n",
    "    \n",
    "    fold_importance_df = pd.DataFrame()\n",
    "    fold_importance_df = pd.DataFrame(model.get_fscore().items(), \n",
    "                                      columns=['feature','importance']).sort_values('importance', ascending=False)\n",
    "    fold_importance_df[\"fold\"] = fold_n + 1\n",
    "    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
    "    \n",
    "    print(f\"Fold {fold_n + 1} | rmse: {np.sqrt(mean_squared_error(y_valid, y_pred_valid))}\")\n",
    "    \n",
    "    score += np.sqrt(mean_squared_error(y_valid,y_pred_valid)) / NFOLDS\n",
    "    \n",
    "    models.append(model)\n",
    "          \n",
    "    del dtrain, dvalid, watchlist, y_valid\n",
    "    gc.collect()\n",
    "    \n",
    "print(f\"\\nMean rmse = {score}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "out_folder_train_prediction.to_csv('out_folder_train_prediction_xgb.csv',index= False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.1272133333333334"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# xgb \n",
    "(1.14988+1.07825+ 1.15351)/3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Memory usage of dataframe is 4771.91 MB\n",
      "Memory usage after optimization is: 1671.69 MB\n",
      "Decreased by 65.0%\n"
     ]
    }
   ],
   "source": [
    "test_feature = pd.read_csv('../../Large_output/test_merge.csv')\n",
    "test_feature = reduce_mem_usage(test_feature)\n",
    "test_feature = features_engineering(test_feature)\n",
    "row_ids = test_feature[['row_id']]\n",
    "test_feature = test_feature[['building_id', 'meter','site_id','primary_use', 'square_feet','air_temperature',\\\n",
    "                    'cloud_coverage','dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "dtest = xgb.DMatrix(test_feature)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/local/home/ningzesun/.local/lib/python3.6/site-packages/ipykernel_launcher.py:3: DeprecationWarning: elementwise comparison failed; this will raise an error in the future.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "results = []\n",
    "for model in models:\n",
    "    if  results == []:\n",
    "        results = np.expm1(model.predict(dtest, ntree_limit=model.best_ntree_limit)) / len(models)\n",
    "    else:\n",
    "        results += np.expm1(model.predict(dtest, ntree_limit=model.best_ntree_limit)) / len(models)\n",
    "    del model\n",
    "    gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_feature['meter_reading']=np.clip(results, 0, a_max=None)\n",
    "test_feature.loc[(test_feature['site_id']==0) & \n",
    "                 (test_feature['meter']==0),'meter_reading']=test_feature.loc[(test_feature['site_id']==0) &\n",
    "                                                            (test_feature['meter']==0),'meter_reading'].mul(3.4118)\n",
    "df_result = pd.DataFrame({'row_id': row_ids['row_id'], 'meter_reading': test_feature['meter_reading']})\n",
    "df_result.to_csv('../../Large_output/xgb_bayes_clean.csv',index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
